Importation des bibliotheques¶

In [ ]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import folium
from folium.plugins import HeatMap, MarkerCluster

from sklearn.preprocessing import  LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

Chargement des données¶

In [ ]:
airbnb = pd.read_csv("airbnb_train.csv")
airbnb.head()
Out[ ]:
id log_price property_type room_type amenities accommodates bathrooms bed_type cancellation_policy cleaning_fee ... last_review latitude longitude name neighbourhood number_of_reviews review_scores_rating zipcode bedrooms beds
0 5708593 4.317488 House Private room {TV,"Wireless Internet",Kitchen,"Free parking ... 3 1.0 Real Bed flexible False ... NaN 33.782712 -118.134410 Island style Spa Studio Long Beach 0 NaN 90804 0.0 2.0
1 14483613 4.007333 House Private room {"Wireless Internet","Air conditioning",Kitche... 4 2.0 Real Bed strict False ... 2017-09-17 40.705468 -73.909439 Beautiful and Simple Room W/2 Beds, 25 Mins to... Ridgewood 38 86.0 11385 1.0 2.0
2 10412649 7.090077 Apartment Entire home/apt {TV,"Wireless Internet","Air conditioning",Kit... 6 2.0 Real Bed flexible False ... NaN 38.917537 -77.031651 2br/2ba luxury condo perfect for infant / toddler U Street Corridor 0 NaN 20009 2.0 2.0
3 17954362 3.555348 House Private room {TV,"Cable TV",Internet,"Wireless Internet","A... 1 1.0 Real Bed flexible True ... 2017-09-29 40.736001 -73.924248 Manhattan view from Queens. Lovely single room . Sunnyside 19 96.0 11104 1.0 1.0
4 9969781 5.480639 House Entire home/apt {TV,"Cable TV",Internet,"Wireless Internet",Ki... 4 1.0 Real Bed moderate True ... 2017-08-28 37.744896 -122.430665 Zen Captured Noe Valley House Noe Valley 15 96.0 94131 2.0 2.0

5 rows × 28 columns

Exploration¶

In [ ]:
# Plot de la distribution des prix (log_price)
plt.figure(figsize=(10, 8))
sns.histplot(airbnb["log_price"], kde=True, bins=30)
plt.title('Distribution des prix')
plt.xlabel('Log Price')
plt.ylabel('Density')
plt.show()
No description has been provided for this image
In [ ]:
# Heatmap
latitude_mean = airbnb['latitude'].mean()
longitude_mean = airbnb['longitude'].mean()

m = folium.Map(location=[latitude_mean, longitude_mean], zoom_start=5)

heat_data = [[row['latitude'], row['longitude']] for index, row in airbnb.iterrows()]

HeatMap(heat_data).add_to(m)

# Clusters
marker_cluster = MarkerCluster().add_to(m)

# Marqueurs aux clusters
for index, row in airbnb.iterrows():
    folium.Marker(
        location=[row['latitude'], row['longitude']],
        popup=row['name'],
    ).add_to(marker_cluster)

m
Out[ ]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [ ]:
# Plot catplot pour les villes
sns.set(font_scale=1.5)
plt.figure(figsize=(14, 8))
sns.catplot(x='city', kind='count', data=airbnb, height=6, aspect=2, palette='rocket')
plt.title('Distribution des villes')
plt.xlabel('City')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()
<Figure size 1400x800 with 0 Axes>
No description has been provided for this image
In [ ]:
# Plot violinplot pour le log_price par ville
plt.figure(figsize=(16, 8))
sns.violinplot(x='city', y='log_price', data=airbnb, palette='rocket')
plt.title('Distribution du logprix par ville')
plt.xlabel('City')
plt.ylabel('Log Price')
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [ ]:
# Plot catplot pour le type de propriété
sns.set(font_scale=1.5)
plt.figure(figsize=(14, 8))
sns.catplot(x='property_type', kind='count', data=airbnb, height=6, aspect=2, palette='rocket')
plt.title('Distirbution par type de propriete')
plt.xlabel('Property Type')
plt.ylabel('Count')
plt.xticks(rotation=90)
plt.show()
<Figure size 1400x800 with 0 Axes>
No description has been provided for this image
In [ ]:
# Plot violinplot pour le log_price par property_type
plt.figure(figsize=(14, 8))
sns.violinplot(x='property_type', y='log_price', data=airbnb, palette='rocket')
plt.title('Distribution du logprix par type de propriete')
plt.xlabel('Property Type')
plt.ylabel('Log Price')
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [ ]:
# Types de propriété à inclure dans le violin plot
important_property_types = ['Apartment', 'House', 'Condominium', 'Townhouse', 'Loft']

# Filtrer les données pour inclure seulement les types de propriété importants
filtered_airbnb = airbnb[airbnb['property_type'].isin(important_property_types)]

# Plot violinplot pour le log_price par property_type
plt.figure(figsize=(14, 8))
sns.violinplot(x='property_type', y='log_price', data=filtered_airbnb, palette='rocket')
plt.title('Distribution du logprix pour les type de propriete les plus nombreux')
plt.xlabel('Property Type')
plt.ylabel('Log Price')
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [ ]:
# Plot catplot pour le type de chambre
sns.set(font_scale=1.5)
plt.figure(figsize=(14, 8))
sns.catplot(x='room_type', kind='count', data=airbnb, height=6, aspect=2, palette='rocket')
plt.title('Distribution du type de chambre')
plt.xlabel('Room Type')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()
<Figure size 1400x800 with 0 Axes>
No description has been provided for this image
In [ ]:
room_type_counts = airbnb['room_type'].value_counts()

# Plot pie chart pour le type de chambre
plt.figure(figsize=(8, 8))
plt.pie(room_type_counts, labels=room_type_counts.index, autopct='%1.2f%%', colors=sns.color_palette('rocket', len(room_type_counts)))
plt.title('Distribution du type de chambre')
plt.axis('equal')
plt.show()
No description has been provided for this image
In [ ]:
# Plot violinplot pour le log_price par room_type
plt.figure(figsize=(12, 8))
sns.violinplot(x='room_type', y='log_price', data=airbnb, palette='rocket')
plt.title('Distribution du logprix en fonction du type de chambre ')
plt.xlabel('Room Type')
plt.ylabel('Log Price')
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [ ]:
#Relation entre la note des avis (review_scores_rating) et le prix (log_price) par type de chambre
sns.relplot(x='review_scores_rating', y='log_price', hue='room_type', data=airbnb, height=8, palette='rocket')
plt.title('Relation entre la note des avis (review_scores_rating) et le prix (log_price) par type de chambre')
plt.show()
No description has been provided for this image
In [ ]:
# Plot catplot pour les politiques d'annulation
plt.figure(figsize=(10, 8))
sns.catplot(x='cancellation_policy', kind='count', data=airbnb, palette='rocket')
plt.title('Distribution des politiques dannulation')
plt.xlabel('Cancellation Policy')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()
<Figure size 1000x800 with 0 Axes>
No description has been provided for this image
In [ ]:
# Plot violinplot pour le log_price par cancellation_policy
plt.figure(figsize=(12, 8))
sns.violinplot(x='cancellation_policy', y='log_price', data=airbnb, palette='rocket')
plt.title('Distribution du logprix en fonction des politiques dannulation')
plt.xlabel('Cancellation Policy')
plt.ylabel('Log Price')
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [ ]:
# Plot catplot pour les frais de nettoyage (cleaning_fee)
plt.figure(figsize=(10, 8))
sns.catplot(x='cleaning_fee', kind='count', data=airbnb, palette='rocket')
plt.title('Distribution des frais de nettoyage')
plt.xlabel('Cleaning Fee Included')
plt.ylabel('Count')
plt.show()
<Figure size 1000x800 with 0 Axes>
No description has been provided for this image
In [ ]:
# Plot violinplot pour le log_price par cleaning_fee
plt.figure(figsize=(12, 8))
sns.violinplot(x='cleaning_fee', y='log_price', data=airbnb, palette='rocket')
plt.title('Distribution du logprix en fonction des frais de nettoyage')
plt.xlabel('Cleaning Fee Included')
plt.ylabel('Log Price')
plt.show()
No description has been provided for this image
In [ ]:
# Plot catplot pour les types de lit (bed_type)
plt.figure(figsize=(10, 8))
sns.catplot(x='bed_type', kind='count', data=airbnb, palette='rocket')
plt.title('Distribution des types de lit')
plt.xlabel('Bed Type')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()
<Figure size 1000x800 with 0 Axes>
No description has been provided for this image
In [ ]:
# Plot violinplot pour le log_price par bed_type
plt.figure(figsize=(12, 8))
sns.violinplot(x='bed_type', y='log_price', data=airbnb, palette='rocket')
plt.title('Distribution du logprix en fonction des types de lit')
plt.xlabel('Bed Type')
plt.ylabel('Log Price')
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [ ]:
sns.color_palette("rocket")
# Plot catplot pour les notes
plt.figure(figsize=(14, 8))
sns.catplot(x='review_scores_rating', kind='count', data=airbnb, height=6, aspect=3)
plt.title('Distribution des notes')
plt.xlabel('Review Scores Rating')
plt.ylabel('Count')
plt.xticks(rotation=90)
plt.show()
<Figure size 1400x800 with 0 Axes>
No description has been provided for this image
In [ ]:
# Plot catplot pour les notes par groupées par dizaines
airbnb['rating_group'] = (airbnb['review_scores_rating'] // 10) * 10

airbnb['rating_group'] = airbnb['rating_group'].fillna('Missing')


plt.figure(figsize=(12, 8))
sns.catplot(x='rating_group', kind='count', data=airbnb, palette='rocket')
plt.title('Distribution des notes par groupe de dizaine')
plt.xlabel('Review Scores Rating Group')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()
<Figure size 1200x800 with 0 Axes>
No description has been provided for this image
In [ ]:
# Plot violinplot pour le log_price par rating_group
airbnb['rating_group'] = (airbnb['review_scores_rating'] // 10) * 10

airbnb['rating_group'] = airbnb['rating_group'].fillna(-1)

plt.figure(figsize=(12, 8))
sns.violinplot(x='rating_group', y='log_price', data=airbnb, palette='rocket')
plt.title('Distribution du logprix en fonction des notes')
plt.xlabel('Review Scores Rating Group')
plt.ylabel('Log Price')
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [ ]:
# Plot catplot pour les accommodates
plt.figure(figsize=(14, 8))
sns.catplot(x='accommodates', kind='count', data=airbnb, height=6, aspect=2, palette='rocket')
plt.title('Distribution des Accommodates')
plt.xlabel('Accommodates')
plt.ylabel('Count')
plt.show()
<Figure size 1400x800 with 0 Axes>
No description has been provided for this image
In [ ]:
# Plot violinplot pour le log_price par accommodates
plt.figure(figsize=(14, 8))
sns.violinplot(x='accommodates', y='log_price', data=airbnb, palette='rocket')
plt.title('Distribution du logprix en fonction des accommodates')
plt.xlabel('Accommodates')
plt.ylabel('Log Price')
plt.show()
No description has been provided for this image
In [ ]:
# Plot catplot pour les bedrooms (nombre de chambres)
plt.figure(figsize=(14, 8))
sns.catplot(x='bedrooms', kind='count', data=airbnb, height=6, aspect=2, palette='rocket')
plt.title('Distribution du nombre de chambres')
plt.xlabel('Bedrooms')
plt.ylabel('Count')
plt.show()
<Figure size 1400x800 with 0 Axes>
No description has been provided for this image
In [ ]:
# Plot violinplot pour le log_price par bedrooms
plt.figure(figsize=(14, 8))
sns.violinplot(x='bedrooms', y='log_price', data=airbnb, palette='rocket')
plt.title('Distribution du logprix en fonction du nombre de chambres')
plt.xlabel('Bedrooms')
plt.ylabel('Log Price')
plt.show()
No description has been provided for this image
In [ ]:
# Plot catplot pour les beds (nombre de lit)
plt.figure(figsize=(14, 8))
sns.catplot(x='beds', kind='count', data=airbnb, height=6, aspect=2, palette='rocket')
plt.title('Distribution du nombre de lits')
plt.xlabel('Beds')
plt.ylabel('Count')
plt.show()
<Figure size 1400x800 with 0 Axes>
No description has been provided for this image
In [ ]:
# Plot violinplot pour le log_price par beds
plt.figure(figsize=(14, 8))
sns.violinplot(x='beds', y='log_price', data=airbnb, palette='rocket')
plt.title('Distribution du logprix en fonction du nombre de lits')
plt.xlabel('Beds')
plt.ylabel('Log Price')
plt.show()
No description has been provided for this image
In [ ]:
# Plot catplot pour les num_amenities

airbnb['num_amenities'] = airbnb['amenities'].apply(lambda x: len(x.strip('{}').split(',')))


plt.figure(figsize=(14, 8))
sns.catplot(x='num_amenities', kind='count', data=airbnb, height=6, aspect=2, palette='rocket')
plt.title('Distribution du nombre dAmenities')
plt.xlabel('Number of Amenities')
plt.ylabel('Count')
plt.show()
<Figure size 1400x800 with 0 Axes>
No description has been provided for this image
In [ ]:
# Plot violinplot pour num_amenities en fonction de log_price

airbnb['num_amenities'] = airbnb['amenities'].apply(lambda x: len(x.strip('{}').split(',')))


plt.figure(figsize=(50, 8))
sns.violinplot(x='num_amenities', y='log_price', data=airbnb, palette='rocket')
plt.title('Distribution du logprix en fonction du nombre dAmenities')
plt.xlabel('Number of Amenities')
plt.ylabel('Log Price')
plt.show()
No description has been provided for this image
In [ ]:
# Plot catplot pour les types d'amenities les plus fréquents
airbnb['amenities_list'] = airbnb['amenities'].apply(lambda x: x.strip('{}').replace('"', '').split(','))

unique_amenities = set()
for amenities in airbnb['amenities_list']:
    unique_amenities.update(amenities)

amenities_count = {}
for amenity in unique_amenities:
    amenities_count[amenity] = sum(airbnb['amenities_list'].apply(lambda x: amenity in x))

amenities_df = pd.DataFrame(list(amenities_count.items()), columns=['Amenity', 'Count'])

top_20_amenities = amenities_df.sort_values(by='Count', ascending=False).head(20)

plt.figure(figsize=(12, 10))
sns.barplot(x='Count', y='Amenity', data=top_20_amenities, palette='rocket')
plt.title('Top 20 amenities les plus communs')
plt.xlabel('Count')
plt.ylabel('Amenity')
plt.show()
No description has been provided for this image
In [ ]:
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

x = airbnb['review_scores_rating']
y = airbnb['accommodates']
z = airbnb['log_price']

scatter = ax.scatter(x, y, z, c=z, cmap='rocket', marker='o', alpha=0.6)

ax.set_xlabel('Review Scores Rating')
ax.set_ylabel('Accommodates')
ax.set_zlabel('Log Price')
ax.set_title('visualisation de la relation entre Review Scores, Accommodates et Log Price')

cbar = fig.colorbar(scatter)
cbar.set_label('Log Price')

plt.show()
No description has been provided for this image
In [ ]:
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

x = airbnb['number_of_reviews']
y = airbnb['review_scores_rating']
z = airbnb['log_price']

scatter = ax.scatter(x, y, z, c=z, cmap='rocket', marker='o', alpha=0.6)

ax.set_xlabel('Number of Reviews')
ax.set_ylabel('Review Scores Rating')
ax.set_zlabel('Log Price')
ax.set_title('Relation entre Number of Reviews, Review Scores et Log Price')

cbar = fig.colorbar(scatter)

plt.show()
No description has been provided for this image

Entrainement¶

Standardiser (peut etre mettre une fonction)¶

In [ ]:
for column in airbnb.columns:

    if airbnb[column].isnull().sum() != 0:

        print("\n{} :- {},  dtypes : {}".format(column,airbnb[column].isnull().sum(),airbnb[column].dtypes))
bathrooms :- 51,  dtypes : float64

first_review :- 4725,  dtypes : object

host_has_profile_pic :- 56,  dtypes : object

host_identity_verified :- 56,  dtypes : object

host_response_rate :- 5475,  dtypes : object

host_since :- 56,  dtypes : object

last_review :- 4716,  dtypes : object

neighbourhood :- 2086,  dtypes : object

review_scores_rating :- 4978,  dtypes : float64

zipcode :- 303,  dtypes : object

bedrooms :- 26,  dtypes : float64

beds :- 35,  dtypes : float64
In [ ]:
nb_amenities = []
for i in airbnb["amenities"]:
    nb_amenities.append(len(i))

airbnb["amenities"] = nb_amenities
In [ ]:
categorical_col = []
numerical_col = []
for column in airbnb.columns:

    if airbnb[column].dtypes != "float64" and airbnb[column].dtypes != "int64":
        categorical_col.append(column)
    else:
        numerical_col.append(column)
In [ ]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
In [ ]:
for col in categorical_col:
    airbnb[col] = le.fit_transform(airbnb[col])

Affichage correlation¶

In [ ]:
plt.figure(figsize = (40,40))
sns.heatmap(airbnb.corr(), annot=True, fmt=".2f", cmap="seismic")
plt.show()
No description has been provided for this image

Conclusion¶

In [ ]:
x = airbnb.drop(["id", "name", "log_price", "description", "first_review", "host_since", "last_review", "neighbourhood", "zipcode"], axis=1)
y = airbnb.log_price

x = x.apply(lambda col: col.fillna(col.mean()) if col.dtype in [np.float64, np.int64] else col.fillna(col.mode()[0]))
print(x.head())
   property_type  room_type  amenities  accommodates  bathrooms  bed_type  \
0             16          1        194             3        1.0         4   
1             16          1        410             4        2.0         4   
2              0          0        323             6        2.0         4   
3             16          1        426             1        1.0         4   
4             16          0        321             4        1.0         4   

   cancellation_policy  cleaning_fee  city  host_has_profile_pic  \
0                    0             0     3                     1   
1                    2             0     4                     1   
2                    0             0     2                     1   
3                    0             1     4                     1   
4                    1             1     5                     1   

   host_identity_verified  host_response_rate  instant_bookable   latitude  \
0                       0                  70                 1  33.782712   
1                       1                   2                 1  40.705468   
2                       0                  70                 1  38.917537   
3                       1                   2                 0  40.736001   
4                       1                   2                 0  37.744896   

    longitude  number_of_reviews  review_scores_rating  bedrooms  beds  
0 -118.134410                  0             94.069077       0.0   2.0  
1  -73.909439                 38             86.000000       1.0   2.0  
2  -77.031651                  0             94.069077       2.0   2.0  
3  -73.924248                 19             96.000000       1.0   1.0  
4 -122.430665                 15             96.000000       2.0   2.0  

Regression linéaire¶

In [ ]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=101)
In [ ]:
lr = LinearRegression()
lr.fit(x_train,y_train)

y_pred_lr = lr.predict(x_test)
In [ ]:
r2_lr = metrics.r2_score(y_test, y_pred_lr)

print('\nScore R2 de la regression linéaire : ', r2_lr)
Score R2 de la regression linéaire :  0.5261756962790156
In [ ]:
airbnb_test = pd.read_csv("airbnb_test.csv")

for col in categorical_col:
    if col in airbnb_test.columns:
        airbnb_test[col] = le.fit_transform(airbnb_test[col].astype(str))

amenities_count = []
for i in airbnb_test["amenities"]:
    amenities_count.append(len(i))

airbnb_test["amenities"] = amenities_count
airbnb_test = airbnb_test[x.columns]

airbnb_test = airbnb_test.apply(lambda col: col.fillna(col.mean()) if col.dtype in [np.float64, np.int64] else col.fillna(col.mode()[0]))
airbnb_test_pred = lr.predict(airbnb_test)

print('\nPredictions de airbnb_test :')
print(airbnb_test_pred)
Predictions de airbnb_test :
[5.18052301 5.55832918 5.08000676 ... 4.78966923 4.33535031 5.14220953]

Regression multiléaire¶

In [ ]:
from sklearn.linear_model import Ridge
model = Pipeline([
    ('poly', PolynomialFeatures()),
    ('ridge', Ridge(fit_intercept=True))
])

param_grid = {
    'poly__degree': [1, 2, 3],
    'ridge__alpha': [0.1, 0.5, 1.0, 2.0]
}

poly_tuned = GridSearchCV(model, param_grid, cv=5)
In [ ]:
from sklearn.linear_model import Ridge
model = Pipeline([
    ('poly', PolynomialFeatures()),
    ('ridge', Ridge(fit_intercept=True))
])

param_grid = {
    'poly__degree': [1, 2, 3],
    'ridge__alpha': [0.1, 0.5, 1.0, 2.0]
}

poly_tuned = GridSearchCV(model, param_grid, cv=5)
In [ ]:
poly_tuned.fit(x_train, y_train)

y_pred_poly = poly_tuned.predict(x_test)
In [ ]:
r2_poly   = metrics.r2_score(y_test, y_pred_poly)

print('\nScore R2 de la régression polynomiale : ', r2_poly)
Score R2 de la régression polynomiale :  0.5782025623013587
In [ ]:
airbnb_test_pred = poly_tuned.predict(airbnb_test)

# Afficher les prédictions
print('\nPredictions de airbnb_test :')
print(airbnb_test_pred)
Predictions de airbnb_test :
[5.27935138 5.79454588 5.34103185 ... 5.06122197 4.61032603 5.50887927]

Méthodes de gradient¶

In [ ]:
gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)
In [ ]:
gb.fit(x_train, y_train)

y_pred_gb = gb.predict(x_test)
In [ ]:
r2_gb   = metrics.r2_score(y_test, y_pred_gb)

print('\nScore R2 de gradient "boosté" : ', r2_gb)
Score R2 de gradient "boosté" :  0.6592827837059045
In [ ]:
airbnb_test_pred = gb.predict(airbnb_test)

# Afficher les prédictions
print('\nPredictions de airbnb_test:')
print(airbnb_test_pred)
Predictions de airbnb_test:
[4.95065191 5.63269307 5.05190987 ... 4.99656526 4.35996765 5.25623102]

Forêt d’arbres de décision¶

In [ ]:
rf = RandomForestRegressor()
In [ ]:
rf.fit(x_train,y_train)

y_pred_rf = rf.predict(x_test)
In [ ]:
r2_rf = metrics.r2_score(y_test, y_pred_rf)

print('\nScore R2 de la Forêt d’arbres de décision : ', r2_rf)
Score R2 de la Forêt d’arbres de décision :  0.6724632688442183
In [ ]:
airbnb_test_foret = rf.predict(airbnb_test)

print('\nPredictions de airbnb_test :')
print(airbnb_test_foret)
Predictions de airbnb_test :
[4.86502685 5.92150521 4.92577966 ... 5.03177963 4.2199774  5.43019803]
In [ ]:
prediction_example = pd.read_csv("prediction_example.csv")
prediction_example["logpred"] = airbnb_test_foret

prediction_example.to_csv("MaPredictionFinale.csv", index=False) # index=False pour éviter d’ajouter l’index interne à pandas
# Voilà !
In [ ]:
def estConforme(monFichier_csv):
    votre_prediction = pd.read_csv(monFichier_csv)

    fichier_exemple = pd.read_csv("prediction_example.csv")

    assert votre_prediction.columns[1] == fichier_exemple.columns[1], f"Attention, votre colonne de prédiction doit s'appeler {fichier_exemple.columns[1]}, elle s'appelle '{votre_prediction.columns[1]}'"
    assert len(votre_prediction) == len(fichier_exemple), f"Attention, vous devriez avoir {len(fichier_exemple)} prédiction dans votre fichier, il en contient '{len(votre_prediction)}'"

    assert np.all(votre_prediction.iloc[:,0] == fichier_exemple.iloc[:, 0])

    print("Fichier conforme!")

estConforme("MaPredictionFinale.csv")
Fichier conforme!